library("readxl")
library("dplyr")
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library("ggplot2")
library("rockchalk")
##
## Attaching package: 'rockchalk'
## The following object is masked from 'package:dplyr':
##
## summarize
library("Hmisc")
## Loading required package: lattice
## Loading required package: survival
## Loading required package: Formula
##
## Attaching package: 'Hmisc'
## The following object is masked from 'package:rockchalk':
##
## summarize
## The following objects are masked from 'package:dplyr':
##
## src, summarize
## The following objects are masked from 'package:base':
##
## format.pval, units
library("psych")
##
## Attaching package: 'psych'
## The following object is masked from 'package:Hmisc':
##
## describe
## The following objects are masked from 'package:ggplot2':
##
## %+%, alpha
library(olsrr)
## Registered S3 methods overwritten by 'car':
## method from
## influence.merMod lme4
## cooks.distance.influence.merMod lme4
## dfbeta.influence.merMod lme4
## dfbetas.influence.merMod lme4
##
## Attaching package: 'olsrr'
## The following object is masked from 'package:datasets':
##
## rivers
library(car)
## Loading required package: carData
##
## Attaching package: 'car'
## The following object is masked from 'package:psych':
##
## logit
## The following object is masked from 'package:dplyr':
##
## recode
library(lm.beta)
library(userfriendlyscience)
## Registered S3 method overwritten by 'GGally':
## method from
## +.gg ggplot2
##
## Attaching package: 'userfriendlyscience'
## The following object is masked from 'package:Hmisc':
##
## escapeRegex
## The following object is masked from 'package:lattice':
##
## oneway
library(stargazer)
##
## Please cite as:
## Hlavac, Marek (2018). stargazer: Well-Formatted Regression and Summary Statistics Tables.
## R package version 5.2.2. https://CRAN.R-project.org/package=stargazer
Reading excel file
academic <- read_excel("dataAcademic.xlsx")
## New names:
## * `` -> ...10
Size of the dataset
cat("Dimension of data", dim(academic),"\n")
## Dimension of data 12411 45
cat("Total number of rows",nrow(academic),fill=TRUE)
## Total number of rows 12411
cat("Total number of columns",ncol(academic),fill=TRUE)
## Total number of columns 45
Extracting sample data : Total 10% of the sample is extracted from the population. Since the observation is extracted randomly we can say that it is representative of the entire population.
# Draw a simple random sample of 10% of the rows (rounded up) so the
# sample is representative of the full population.
sample_academic <- dplyr::sample_n(academic, size = ceiling(0.1 * nrow(academic)))
sample_academic
## # A tibble: 1,242 x 45
## COD_S11 GENDER EDU_FATHER EDU_MOTHER OCC_FATHER OCC_MOTHER STRATUM SISBEN
## <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr>
## 1 SB1120… F Complete … Complete … Operator Other occ… Stratu… Level…
## 2 SB1120… M Incomplet… Incomplet… Independe… Home Stratu… Level…
## 3 SB1120… F Complete … Complete … Entrepren… Home Stratu… It is…
## 4 SB1120… F Complete … Complete … 0 Technical… Stratu… It is…
## 5 SB1120… M Complete … Incomplet… Other occ… Home Stratu… It is…
## 6 SB1120… M Complete … Complete … Independe… Independe… Stratu… It is…
## 7 SB1120… F Complete … Complete … Independe… Independe… Stratu… Level…
## 8 SB1120… F Complete … Incomplet… Operator Home Stratu… Level…
## 9 SB1120… M Incomplet… Complete … Independe… Operator Stratu… Level…
## 10 SB1120… F Complete … Complete … Technical… Home Stratu… It is…
## # … with 1,232 more rows, and 37 more variables: PEOPLE_HOUSE <chr>,
## # ...10 <lgl>, INTERNET <chr>, TV <chr>, COMPUTER <chr>, WASHING_MCH <chr>,
## # MIC_OVEN <chr>, CAR <chr>, DVD <chr>, FRESH <chr>, PHONE <chr>,
## # MOBILE <chr>, REVENUE <chr>, JOB <chr>, SCHOOL_NAME <chr>,
## # SCHOOL_NAT <chr>, SCHOOL_TYPE <chr>, MAT_S11 <dbl>, CR_S11 <dbl>,
## # CC_S11 <dbl>, BIO_S11 <dbl>, ENG_S11 <dbl>, Cod_SPro <chr>,
## # UNIVERSITY <chr>, ACADEMIC_PROGRAM <chr>, QR_PRO <dbl>, CR_PRO <dbl>,
## # CC_PRO <dbl>, ENG_PRO <dbl>, WC_PRO <dbl>, FEP_PRO <dbl>, G_SC <dbl>,
## # PERCENTILE <dbl>, `2ND_DECILE` <dbl>, QUARTILE <dbl>, SEL <dbl>,
## # SEL_IHE <dbl>
Variables of interest and their description: Mathematics, Biology, Written Communication, Global Score, Percentile,TV, REVENUE, FRESH
varInterest <- c("MAT_S11","BIO_S11","WC_PRO","G_SC","PERCENTILE","TV","REVENUE","ACADEMIC_PROGRAM","FRESH")
Assessing for missing data
cat("Missing observations in the entire dataset",sum(is.na(academic)))
## Missing observations in the entire dataset 12411
Total number of missing points in the entire dataset is 12411, which indicates an entire column. When the dataset is observed carefully, an entire column was missing.
Summary of variable of interest from the sample.
sdatasubset<-academic[varInterest]#subset considering variables of interest
summary(sdatasubset) #summary of variables interest.
## MAT_S11 BIO_S11 WC_PRO G_SC
## Min. : 26.00 Min. : 11.00 Min. : 0.0 Min. : 37.0
## 1st Qu.: 56.00 1st Qu.: 56.00 1st Qu.: 28.0 1st Qu.:147.0
## Median : 64.00 Median : 64.00 Median : 56.0 Median :163.0
## Mean : 64.32 Mean : 63.95 Mean : 53.7 Mean :162.7
## 3rd Qu.: 72.00 3rd Qu.: 71.00 3rd Qu.: 80.0 3rd Qu.:179.0
## Max. :100.00 Max. :100.00 Max. :100.0 Max. :247.0
## PERCENTILE TV REVENUE ACADEMIC_PROGRAM
## Min. : 1.00 Length:12411 Length:12411 Length:12411
## 1st Qu.: 51.00 Class :character Class :character Class :character
## Median : 75.00 Mode :character Mode :character Mode :character
## Mean : 68.45
## 3rd Qu.: 90.00
## Max. :100.00
## FRESH
## Length:12411
## Class :character
## Mode :character
##
##
##
The following code represents if there is any variables of interest missing in the sample.
res<-summary(VIM::aggr(sdatasubset, sortVar=TRUE))$combinations
##
## Variables sorted by number of missings:
## Variable Count
## MAT_S11 0
## BIO_S11 0
## WC_PRO 0
## G_SC 0
## PERCENTILE 0
## TV 0
## REVENUE 0
## ACADEMIC_PROGRAM 0
## FRESH 0
Description statistics for each general type of variable
Nomimal - Academic program is nominal data.
# Nominal variable: ACADEMIC_PROGRAM.
# Fix: factor() must be applied to the column VECTOR, not the one-column
# tibble -- factor(academic[varInterest][8]) collapsed the whole tibble
# into a single <NA> observation with one garbled level (see the output
# that followed). `[[` extracts the underlying character vector.
facAcademic <- factor(academic[[varInterest[8]]])
facAcademic
## ACADEMIC_PROGRAM
## <NA>
## Levels: c("INDUSTRIAL ENGINEERING", "ELECTRONIC ENGINEERING", "CIVIL ENGINEERING", "MECHANICAL ENGINEERING", "ELECTRIC ENGINEERING", "ELECTRIC ENGINEERING AND TELECOMMUNICATIONS", "CHEMICAL ENGINEERING", "AERONAUTICAL ENGINEERING", "MECHATRONICS ENGINEERING", "INDUSTRIAL AUTOMATIC ENGINEERING", "TRANSPORTATION AND ROAD ENGINEERING", "TOPOGRAPHIC ENGINEERY", "INDUSTRIAL CONTROL AND AUTOMATION ENGINEERING", "CONTROL ENGINEERING", "CATASTRAL ENGINEERING AND GEODESY", "PRODUCTION ENGINEERING", "PRODUCTIVITY AND QUALITY ENGINEERING", \n"CIVIL CONSTRUCTIONS", "ELECTROMECHANICAL ENGINEERING", "AUTOMATION ENGINEERING", "TEXTILE ENGINEERING")
Frequency counts in the sample
freqAcademic <- table(academic[varInterest][8])
freqAcademic # display total number of each categorical variable.
##
## AERONAUTICAL ENGINEERING
## 44
## AUTOMATION ENGINEERING
## 10
## CATASTRAL ENGINEERING AND GEODESY
## 78
## CHEMICAL ENGINEERING
## 1000
## CIVIL CONSTRUCTIONS
## 14
## CIVIL ENGINEERING
## 3320
## CONTROL ENGINEERING
## 20
## ELECTRIC ENGINEERING
## 278
## ELECTRIC ENGINEERING AND TELECOMMUNICATIONS
## 47
## ELECTROMECHANICAL ENGINEERING
## 34
## ELECTRONIC ENGINEERING
## 849
## INDUSTRIAL AUTOMATIC ENGINEERING
## 22
## INDUSTRIAL CONTROL AND AUTOMATION ENGINEERING
## 1
## INDUSTRIAL ENGINEERING
## 5318
## MECHANICAL ENGINEERING
## 1135
## MECHATRONICS ENGINEERING
## 82
## PRODUCTION ENGINEERING
## 60
## PRODUCTIVITY AND QUALITY ENGINEERING
## 29
## TEXTILE ENGINEERING
## 1
## TOPOGRAPHIC ENGINEERY
## 42
## TRANSPORTATION AND ROAD ENGINEERING
## 27
round(prop.table(freqAcademic),digits=2) # proportion from the frequency table
##
## AERONAUTICAL ENGINEERING
## 0.00
## AUTOMATION ENGINEERING
## 0.00
## CATASTRAL ENGINEERING AND GEODESY
## 0.01
## CHEMICAL ENGINEERING
## 0.08
## CIVIL CONSTRUCTIONS
## 0.00
## CIVIL ENGINEERING
## 0.27
## CONTROL ENGINEERING
## 0.00
## ELECTRIC ENGINEERING
## 0.02
## ELECTRIC ENGINEERING AND TELECOMMUNICATIONS
## 0.00
## ELECTROMECHANICAL ENGINEERING
## 0.00
## ELECTRONIC ENGINEERING
## 0.07
## INDUSTRIAL AUTOMATIC ENGINEERING
## 0.00
## INDUSTRIAL CONTROL AND AUTOMATION ENGINEERING
## 0.00
## INDUSTRIAL ENGINEERING
## 0.43
## MECHANICAL ENGINEERING
## 0.09
## MECHATRONICS ENGINEERING
## 0.01
## PRODUCTION ENGINEERING
## 0.00
## PRODUCTIVITY AND QUALITY ENGINEERING
## 0.00
## TEXTILE ENGINEERING
## 0.00
## TOPOGRAPHIC ENGINEERY
## 0.00
## TRANSPORTATION AND ROAD ENGINEERING
## 0.00
Ordinal : Revenue is ordinal data.
freqRevenue <- table(academic[varInterest][7])
freqRevenue
##
## 0 10 or more LMMW
## 279 718
## Between 1 and less than 2 LMMW Between 2 and less than 3 LMMW
## 3873 2783
## Between 3 and less than 5 LMMW Between 5 and less than 7 LMMW
## 2239 973
## Between 7 and less than 10 LMMW less than 1 LMMW
## 509 1037
round(prop.table(freqRevenue),digits=2) # proportion from the frequency table
##
## 0 10 or more LMMW
## 0.02 0.06
## Between 1 and less than 2 LMMW Between 2 and less than 3 LMMW
## 0.31 0.22
## Between 3 and less than 5 LMMW Between 5 and less than 7 LMMW
## 0.18 0.08
## Between 7 and less than 10 LMMW less than 1 LMMW
## 0.04 0.08
Interval - MAT_S11 is example for interval data. Maths score is normally distributed, therefore mean, count, standard deviation is derived.
# Interval variable: MAT_S11 (mathematics score).
cat("Range of math's score in the sample", range(academic[varInterest][1]), "\n")
## Range of math's score in the sample 26 100
cat("Summary of math's score\n")
## Summary of math's score
# Fix: summary() takes no separator argument -- the stray "\n" was being
# passed positionally as `maxsum` and silently ignored.
summary(academic[varInterest][1])
## MAT_S11
## Min. : 26.00
## 1st Qu.: 56.00
## Median : 64.00
## Mean : 64.32
## 3rd Qu.: 72.00
## Max. :100.00
cat("Mean of math's score in the sample",mean(as.numeric(unlist(academic['MAT_S11']))),"\n")
## Mean of math's score in the sample 64.32076
cat("Standard deviation of math's score",
sd(as.numeric(unlist(academic['MAT_S11']))),"\n")
## Standard deviation of math's score 11.87365
cat("Minimum of math's score",
min(as.numeric(unlist(academic['MAT_S11']))),"\n")
## Minimum of math's score 26
cat("Maximum of math's score",
max(as.numeric(unlist(academic['MAT_S11']))),"\n")
## Maximum of math's score 100
psych::describe(as.numeric(unlist(academic[varInterest][1])))
## vars n mean sd median trimmed mad min max range skew kurtosis se
## X1 1 12411 64.32 11.87 64 63.84 11.86 26 100 74 0.4 0.13 0.11
Hmisc::describe(academic[varInterest][1])
## academic[varInterest][1]
##
## 1 Variables 12411 Observations
## --------------------------------------------------------------------------------
## MAT_S11
## n missing distinct Info Mean Gmd .05 .10
## 12411 0 69 0.999 64.32 13.33 47 50
## .25 .50 .75 .90 .95
## 56 64 72 81 86
##
## lowest : 26 31 32 35 36, highest: 96 97 98 99 100
## --------------------------------------------------------------------------------
pastecs::stat.desc(academic[varInterest][1], basic=F)
## MAT_S11
## median 64.0000000
## mean 64.3207638
## SE.mean 0.1065813
## CI.mean.0.95 0.2089158
## var 140.9835648
## std.dev 11.8736500
## coef.var 0.1846006
Visualization
Nominal data - Academic program
unique(academic[varInterest][8])
## # A tibble: 21 x 1
## ACADEMIC_PROGRAM
## <chr>
## 1 INDUSTRIAL ENGINEERING
## 2 ELECTRONIC ENGINEERING
## 3 CIVIL ENGINEERING
## 4 MECHANICAL ENGINEERING
## 5 ELECTRIC ENGINEERING
## 6 ELECTRIC ENGINEERING AND TELECOMMUNICATIONS
## 7 CHEMICAL ENGINEERING
## 8 AERONAUTICAL ENGINEERING
## 9 MECHATRONICS ENGINEERING
## 10 INDUSTRIAL AUTOMATIC ENGINEERING
## # … with 11 more rows
# Bar chart of the 21 academic programs, with abbreviated axis labels.
x <- table(academic[varInterest][8])
barplot(x,
        main = "Student academic program",
        xlab = "Academic program",
        col = rainbow(length(x)),
        names.arg = c("AE","Auto","CEG","ChE","CC","CE","CoE","EE","EET","ElE","EltE","IAE","ICAE","IE","ME","MecE","PE","PQE","TE","ToE","TRE"))
piepercent <- round(100 * x / sum(x), 1)
# Fix: this table describes academic programs, not rural/urban residence.
cat("Percentage of students in each academic program: \n")
## Percentage of students who live in rural and urban area:
print(piepercent)
##
## AERONAUTICAL ENGINEERING
## 0.4
## AUTOMATION ENGINEERING
## 0.1
## CATASTRAL ENGINEERING AND GEODESY
## 0.6
## CHEMICAL ENGINEERING
## 8.1
## CIVIL CONSTRUCTIONS
## 0.1
## CIVIL ENGINEERING
## 26.8
## CONTROL ENGINEERING
## 0.2
## ELECTRIC ENGINEERING
## 2.2
## ELECTRIC ENGINEERING AND TELECOMMUNICATIONS
## 0.4
## ELECTROMECHANICAL ENGINEERING
## 0.3
## ELECTRONIC ENGINEERING
## 6.8
## INDUSTRIAL AUTOMATIC ENGINEERING
## 0.2
## INDUSTRIAL CONTROL AND AUTOMATION ENGINEERING
## 0.0
## INDUSTRIAL ENGINEERING
## 42.8
## MECHANICAL ENGINEERING
## 9.1
## MECHATRONICS ENGINEERING
## 0.7
## PRODUCTION ENGINEERING
## 0.5
## PRODUCTIVITY AND QUALITY ENGINEERING
## 0.2
## TEXTILE ENGINEERING
## 0.0
## TOPOGRAPHIC ENGINEERY
## 0.3
## TRANSPORTATION AND ROAD ENGINEERING
## 0.2
# Pie chart of academic programs.
# Fix: the label vector must have one entry per table category (21); the
# original supplied only 17 labels, so slices were mislabelled. Reuse the
# same 21 abbreviations (alphabetical table order) as the bar chart.
pie(table(academic[varInterest][8]),
    labels = c("AE","Auto","CEG","ChE","CC","CE","CoE","EE","EET","ElE","EltE","IAE","ICAE","IE","ME","MecE","PE","PQE","TE","ToE","TRE"),
    main = "Student academic program",
    col = rainbow(length(x)))
Ordinal data - Revenue
# Bar chart of the ordinal REVENUE variable (8 income bands, in LMMW).
x <- table(academic[varInterest][7])
# Fix: title said "Student academic program" but this plots revenue.
barplot(x,
        main = "Student household revenue",
        xlab = "Revenue",
        col = rainbow(length(x)),
        names.arg = c("Zero","10 or more","1 & 2","2 & 3","3 & 5","5 & 7 ","7 & 10","< 1"))
piepercent <- round(100 * x / sum(x), 1)
# Fix: this table describes revenue bands, not rural/urban residence.
cat("Percentage of students in each revenue band: \n")
## Percentage of students who live in rural and urban area:
print(piepercent)
##
## 0 10 or more LMMW
## 2.2 5.8
## Between 1 and less than 2 LMMW Between 2 and less than 3 LMMW
## 31.2 22.4
## Between 3 and less than 5 LMMW Between 5 and less than 7 LMMW
## 18.0 7.8
## Between 7 and less than 10 LMMW less than 1 LMMW
## 4.1 8.4
# Pie chart of revenue bands.
# Fix: the original indexed column 8 (ACADEMIC_PROGRAM, 21 categories)
# while supplying the 8 revenue labels, so every slice was wrong. Index
# column 7 (REVENUE) so labels and slices line up; also correct the title.
pie(table(academic[varInterest][7]),
    labels = c("Zero","10 or more","1 & 2","2 & 3","3 & 5","5 & 7 ","7 & 10","< 1"),
    main = "Student household revenue",
    col = rainbow(length(x)))
Interval data - MAT_S11 i.e. score obtained in mathematics.
gs <- ggplot(academic, aes(x=MAT_S11))
gs <- gs + labs(x="Maths score")
gs <- gs + geom_histogram(binwidth=2, colour="black", aes(y=..density.., fill=..count..))
gs <- gs + scale_fill_gradient("Count", low="#DCDCDC", high="#7C7C7C")
gs <- gs + stat_function(fun=dnorm, color="red",args=list(mean=mean(as.numeric(unlist(academic['MAT_S11'])), na.rm=TRUE), sd=sd(as.numeric(unlist(academic['MAT_S11'])), na.rm=TRUE)))
gs
Assessing variable for normality
H0 : Mathematics score is not normally distributed. H1 : Mathematics score is normally distributed.
qqnorm(academic$MAT_S11)
qqline(academic$MAT_S11) #show a line on theplot
tpskew<-semTools::skew(as.numeric(unlist(academic[varInterest][1])))
tpkurt<-semTools::kurtosis(as.numeric(unlist((academic[varInterest][1]))))
tpskew[1]/tpskew[2]
## skew (g1)
## 18.17215
tpkurt[1]/tpkurt[2]
## Excess Kur (g2)
## 2.954064
zmaths<- abs(scale(academic[varInterest][1]))
ex <- FSA::perc(as.numeric(zmaths), 1.96, "gt")
ex
## [1] 4.447667
FSA::perc(as.numeric(zmaths), 3.29, "gt")
## [1] 0
pastecs::stat.desc(academic$MAT_S11, basic=F)
## median mean SE.mean CI.mean.0.95 var std.dev
## 64.0000000 64.3207638 0.1065813 0.2089158 140.9835648 11.8736500
## coef.var
## 0.1846006
Report of normality: Maths score was assessed for normality. Visual inspection of the histogram and QQ-Plot did not reveal gross departures from normality. However, the standardized score for skewness (18.17) and the standardized score for kurtosis (2.95) are not within the acceptable range using the criteria proposed by West, Finch and Curran (1996). Since 100% of the standardized scores for mathematics fall within the bounds +/- 3.29, using the guidance of Field, Miles and Field (2013) the data can be treated as approximately normally distributed (m=64.32, sd=11.87, n=12411).
Correlation
Correlation between mathematics score and biology score is checked.
Let us consider compute normality of biology score.
H0 : Biology score is not normally distributed. H1 : Biology score is normally distributed.
gs <- ggplot(academic, aes(x=BIO_S11))
gs <- gs + labs(x="Biology score")
gs <- gs + geom_histogram(binwidth=2, colour="black", aes(y=..density.., fill=..count..))
gs <- gs + scale_fill_gradient("Count", low="#DCDCDC", high="#7C7C7C")
gs <- gs + stat_function(fun=dnorm, color="red",args=list(mean=mean(as.numeric(unlist(academic['BIO_S11'])), na.rm=TRUE), sd=sd(as.numeric(unlist(academic['BIO_S11'])), na.rm=TRUE)))
gs
qqnorm(academic$BIO_S11)
qqline(academic$BIO_S11) #show a line on theplot
tpskew<-semTools::skew(as.numeric(unlist(academic[varInterest][2])))
tpkurt<-semTools::kurtosis(as.numeric(unlist((academic[varInterest][2]))))
tpskew[1]/tpskew[2]
## skew (g1)
## 13.7976
tpkurt[1]/tpkurt[2]
## Excess Kur (g2)
## 6.800432
zmaths<- abs(scale(academic[varInterest][2]))
ex <- FSA::perc(as.numeric(zmaths), 1.96, "gt")
ex
## [1] 5.559584
FSA::perc(as.numeric(zmaths), 3.29, "gt")
## [1] 0.0322295
pastecs::stat.desc(academic$BIO_S11, basic=F)
## median mean SE.mean CI.mean.0.95 var std.dev
## 64.0000000 63.9505278 0.1001472 0.1963041 124.4757151 11.1568685
## coef.var
## 0.1744609
Report of normality: Biology score was assessed for normality. Visual inspection of the histogram and QQ-Plot did not reveal gross departures from normality. However, the standardized score for skewness (13.80) and the standardized score for kurtosis (6.80) are not within the acceptable range. Since 99.97% of the standardized scores for biology fall within the bounds +/- 3.29, using the guidance of Field, Miles and Field (2013) the data can be treated as approximately normally distributed (m=63.95, sd=11.16, n=12411).
Since both the variables are normally distributed Pearson correlation is considered.
Correlation test:
H0 : There is no relationship between mathematics score and biology score. H1 : There is a relationship between mathematics score and biology score.
x <- academic$BIO_S11
y <- academic$MAT_S11
plot(x, y, main = "Correlation between maths and biology score",
xlab = "Biology", ylab = "Mathematics",
pch = 20, frame = FALSE)
abline(lm(y ~ x, data = academic), col = "blue")
cor.test(as.numeric(unlist(academic[varInterest][2])), as.numeric(unlist(academic[varInterest][1])), method = "pearson")
##
## Pearson's product-moment correlation
##
## data: as.numeric(unlist(academic[varInterest][2])) and as.numeric(unlist(academic[varInterest][1]))
## t = 131.52, df = 12409, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.7556332 0.7703337
## sample estimates:
## cor
## 0.7630821
Report correlation The relationship between mathematics score and biology was investigated using a Pearson correlation. A strong positive correlation was found (r =0.76, n=12409, p < .05).
Assessing normality of written communication.
H0 : Written communication score is not normally distributed. H1 : Written communication score is normally distributed.
gs <- ggplot(academic, aes(x=WC_PRO))
gs <- gs + labs(x="Written communication score")
gs <- gs + geom_histogram(binwidth=2, colour="black", aes(y=..density.., fill=..count..))
gs <- gs + scale_fill_gradient("Count", low="#DCDCDC", high="#7C7C7C")
gs <- gs + stat_function(fun=dnorm, color="red",args=list(mean=mean(as.numeric(unlist(academic['WC_PRO'])), na.rm=TRUE), sd=sd(as.numeric(unlist(academic['WC_PRO'])), na.rm=TRUE)))
gs
qqnorm(academic$WC_PRO)
qqline(academic$WC_PRO)
tpskew<-semTools::skew(as.numeric(unlist(academic[varInterest][3])))
tpkurt<-semTools::kurtosis(as.numeric(unlist((academic[varInterest][3]))))
tpskew[1]/tpskew[2]
## skew (g1)
## -8.12618
tpkurt[1]/tpkurt[2]
## Excess Kur (g2)
## -27.37953
zwcpro<- abs(scale(academic[varInterest][3]))
ex <- FSA::perc(as.numeric(zwcpro), 1.96, "gt")
ex
## [1] 0
FSA::perc(as.numeric(zwcpro), 3.29, "gt")
## [1] 0
pastecs::stat.desc(academic$WC_PRO, basic=F)
## median mean SE.mean CI.mean.0.95 var std.dev
## 56.0000000 53.7034083 0.2693041 0.5278778 900.1040488 30.0017341
## coef.var
## 0.5586561
Reporting normality Written communication score was assessed for normality. Visual inspection of the histogram and QQ-Plot did exhibit issues with skewness and kurtosis. The standardized score for skewness(-8.13) and standardised score for kurtosis (-27.38) is not within the acceptable range. All the data points of standardized scores for written communication fall within the bounds +/- 3.29, using the guidance of Field, Miles and Field (2013) the data can be considered to be a normal distribution (m=53.7, sd=30, n=12411).
Identifying the correlation between mathematics and written communication score
H0 : There is no relationship between mathematics score and written communication score. H1 : There is a relationship between mathematics score and written communication score.
x <- academic$WC_PRO
y <- academic$MAT_S11
plot(x, y, main = "Correlation between maths and written communication score",
xlab = "WC_PRO", ylab = "Mathematics",
pch = 20, frame = FALSE)
abline(lm(y ~ x, data = academic), col = "blue")
cor.test(as.numeric(unlist(academic[varInterest][1])), as.numeric(unlist(academic[varInterest][3])), method = "pearson")
##
## Pearson's product-moment correlation
##
## data: as.numeric(unlist(academic[varInterest][1])) and as.numeric(unlist(academic[varInterest][3]))
## t = 23.593, df = 12409, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.1902951 0.2239720
## sample estimates:
## cor
## 0.207195
Reporting correlation The relationship between mathematics score and written communication was investigated using a Pearson correlation. A very weak positive correlation was found (r =0.2, n=12409, p < .05).
Checking normality of percentile
H0: Percentile is not normally distributed. H1: Percentile is normally distributed.
gs <- ggplot(academic, aes(x=PERCENTILE))
gs <- gs + labs(x="PERCENTILE")
gs <- gs + geom_histogram(binwidth=2, colour="black", aes(y=..density.., fill=..count..))
gs <- gs + scale_fill_gradient("Count", low="#DCDCDC", high="#7C7C7C")
gs <- gs + stat_function(fun=dnorm, color="red",args=list(mean=mean(as.numeric(unlist(academic['PERCENTILE'])), na.rm=TRUE), sd=sd(as.numeric(unlist(academic['PERCENTILE'])), na.rm=TRUE)))
gs
qqnorm(academic$PERCENTILE)
qqline(academic$PERCENTILE) #show a line on theplot
tpskew<-semTools::skew(as.numeric(unlist(academic[varInterest][5])))
tpkurt<-semTools::kurtosis(as.numeric(unlist((academic[varInterest][5]))))
tpskew[1]/tpskew[2]
## skew (g1)
## -34.10691
tpkurt[1]/tpkurt[2]
## Excess Kur (g2)
## -10.37217
zperc<- abs(scale(academic[varInterest][5]))
ex <- FSA::perc(as.numeric(zperc), 1.96, "gt")
ex
## [1] 5.269519
FSA::perc(as.numeric(zperc), 3.29, "gt")
## [1] 0
pastecs::stat.desc(academic$PERCENTILE, basic=F)
## median mean SE.mean CI.mean.0.95 var std.dev
## 75.0000000 68.4464588 0.2321945 0.4551372 669.1301508 25.8675502
## coef.var
## 0.3779239
Reporting normality Percentile was assessed for normality. Visual inspection of the histogram and QQ-Plot did exhibit issues with skewness and kurtosis. The standardized score for skewness (-34.11) and the standardized score for kurtosis (-10.37) are not within the acceptable range. Since all the standardized scores for percentile fall within the bounds +/- 3.29, using the guidance of Field, Miles and Field (2013) the data can be treated as approximately normally distributed (m=68.45, sd=25.87, n=12411).
H0 : There is no relationship between global score and percentile. H1 : There is a relationship between global score and percentile .
x <- academic$G_SC
y <- academic$PERCENTILE
plot(x, y, main = "Correlation between maths and written communication score",
xlab = "G_SC", ylab = "Percentile",
pch = 20, frame = FALSE)
abline(lm(y ~ x, data = academic), col = "blue")
cor.test(as.numeric(unlist(academic[varInterest][4])), as.numeric(unlist(academic[varInterest][5])), method = "pearson")
##
## Pearson's product-moment correlation
##
## data: as.numeric(unlist(academic[varInterest][4])) and as.numeric(unlist(academic[varInterest][5]))
## t = 400.92, df = 12409, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.9622178 0.9647402
## sample estimates:
## cor
## 0.9635004
Reporting correlation The relationship between global score and percentile was investigated using a Pearson correlation. A very strong positive correlation was found (r =0.96, n=12409, p < .05).
Difference test involving 2 groups
Variable TV has 2 outcomes, this indicated whether the respondent has TV or not.
H0: There is no difference between the global score those who have TV and those do not have TV. H1: There is a difference between the global score those who have TV and those do not have TV.
psych::describeBy(as.numeric(unlist(academic[varInterest][4])), academic$TV, mat=TRUE)
## item group1 vars n mean sd median trimmed mad min max
## X11 1 No 1 1842 155.4598 22.74521 155 155.5197 23.7216 77 228
## X12 2 Yes 1 10569 163.9742 22.94365 164 164.2314 23.7216 37 247
## range skew kurtosis se
## X11 151 -0.03963129 -0.13022982 0.5299625
## X12 210 -0.10536223 -0.05546219 0.2231750
car::leveneTest(G_SC ~ TV, data=academic)
## Warning in leveneTest.default(y = y, group = group, ...): group coerced to
## factor.
## Levene's Test for Homogeneity of Variance (center = median)
## Df F value Pr(>F)
## group 1 0.649 0.4205
## 12409
#Pr(>F) - it is not statistically significant so we can assume homogeneity
stats::t.test(G_SC ~ TV ,var.equal=TRUE,data=academic)
##
## Two Sample t-test
##
## data: G_SC by TV
## t = -14.716, df = 12409, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -9.648411 -7.380276
## sample estimates:
## mean in group No mean in group Yes
## 155.4598 163.9742
#Statistically significant difference was found
res <- stats::t.test(G_SC ~ TV ,var.equal=TRUE,data=academic)
effcd=round((2*res$statistic)/sqrt(res$parameter),2)
effectsize::t_to_d(t = res$statistic, res$parameter)
## d | 95% CI
## ----------------------
## -0.26 | [-0.30, -0.23]
#Eta squared calculation
effes=round((res$statistic*res$statistic)/((res$statistic*res$statistic)+(res$parameter)),3)
effes
## t
## 0.017
Reporting difference test
An independent-samples t-test was conducted to compare global score for respondents who have a TV and those who do not. A statistically significant difference in global score was found (M=163.97, SD=22.94 for respondents who have a TV; M=155.46, SD=22.74 for respondents who do not), t(12409) = -14.716, p < .001. Cohen's d indicated a small effect size (-0.26).
Difference test for Revenue:
academic$rv <- as.numeric(factor(academic$REVENUE))
psych::describeBy(academic$G_SC, academic$REVENUE, mat=TRUE)
## item group1 vars n mean sd median
## X11 1 0 1 279 169.7168 22.63700 172
## X12 2 10 or more LMMW 1 718 183.6727 20.72211 185
## X13 3 Between 1 and less than 2 LMMW 1 3873 156.7705 22.04578 157
## X14 4 Between 2 and less than 3 LMMW 1 2783 160.9429 21.63347 162
## X15 5 Between 3 and less than 5 LMMW 1 2239 165.3363 22.07567 167
## X16 6 Between 5 and less than 7 LMMW 1 973 170.8602 21.78682 172
## X17 7 Between 7 and less than 10 LMMW 1 509 176.1768 19.96434 178
## X18 8 less than 1 LMMW 1 1037 153.3144 22.01468 153
## trimmed mad min max range skew kurtosis se
## X11 170.7244 22.2390 37 228 191 -0.943347983 3.641999614 1.3552419
## X12 184.2622 19.2738 114 246 132 -0.280403866 0.326046072 0.7733424
## X13 156.7670 22.2390 75 237 162 -0.014011136 -0.033082696 0.3542433
## X14 161.2236 22.2390 72 247 175 -0.122777144 -0.096571942 0.4100810
## X15 165.7959 22.2390 91 238 147 -0.170770997 -0.206646615 0.4665377
## X16 171.5212 23.7216 76 239 163 -0.310366669 0.352605142 0.6984535
## X17 176.9487 19.2738 118 240 122 -0.290540077 0.008686752 0.8849039
## X18 153.2575 22.2390 81 226 145 0.003089345 -0.036601838 0.6836331
stats::bartlett.test(G_SC~ rv, data=academic)
##
## Bartlett test of homogeneity of variances
##
## data: G_SC by rv
## Bartlett's K-squared = 14.006, df = 7, p-value = 0.05108
#p value > 0.5 , we can assume homogeneity.
userfriendlyscience::oneway(as.factor(academic$REVENUE),y=academic$G_SC,posthoc='Tukey')
## ### Oneway Anova for y=G_SC and x=REVENUE (groups: 0, 10 or more LMMW, Between 1 and less than 2 LMMW, Between 2 and less than 3 LMMW, Between 3 and less than 5 LMMW, Between 5 and less than 7 LMMW, Between 7 and less than 10 LMMW, less than 1 LMMW)
## Registered S3 methods overwritten by 'ufs':
## method from
## grid.draw.ggProportionPlot userfriendlyscience
## pander.associationMatrix userfriendlyscience
## pander.dataShape userfriendlyscience
## pander.descr userfriendlyscience
## pander.normalityAssessment userfriendlyscience
## print.CramersV userfriendlyscience
## print.associationMatrix userfriendlyscience
## print.confIntOmegaSq userfriendlyscience
## print.confIntV userfriendlyscience
## print.dataShape userfriendlyscience
## print.descr userfriendlyscience
## print.ggProportionPlot userfriendlyscience
## print.meanConfInt userfriendlyscience
## print.multiVarFreq userfriendlyscience
## print.normalityAssessment userfriendlyscience
## print.regrInfluential userfriendlyscience
## print.scaleDiagnosis userfriendlyscience
## print.scaleStructure userfriendlyscience
## print.scatterMatrix userfriendlyscience
## Omega squared: 95% CI = [.1; .12], point estimate = .11
## Eta Squared: 95% CI = [.1; .12], point estimate = .11
##
## SS Df MS F p
## Between groups (error + effect) 738464.9 7 105494.99 222.12 <.001
## Within groups (error only) 5890791.92 12403 474.95
##
##
## ### Post hoc test: Tukey
##
## diff lwr
## 10 or more LMMW-0 13.96 9.3
## Between 1 and less than 2 LMMW-0 -12.95 -17.04
## Between 2 and less than 3 LMMW-0 -8.77 -12.92
## Between 3 and less than 5 LMMW-0 -4.38 -8.57
## Between 5 and less than 7 LMMW-0 1.14 -3.34
## Between 7 and less than 10 LMMW-0 6.46 1.54
## less than 1 LMMW-0 -16.4 -20.86
## Between 1 and less than 2 LMMW-10 or more LMMW -26.9 -29.59
## Between 2 and less than 3 LMMW-10 or more LMMW -22.73 -25.5
## Between 3 and less than 5 LMMW-10 or more LMMW -18.34 -21.17
## Between 5 and less than 7 LMMW-10 or more LMMW -12.81 -16.06
## Between 7 and less than 10 LMMW-10 or more LMMW -7.5 -11.32
## less than 1 LMMW-10 or more LMMW -30.36 -33.57
## Between 2 and less than 3 LMMW-Between 1 and less than 2 LMMW 4.17 2.53
## Between 3 and less than 5 LMMW-Between 1 and less than 2 LMMW 8.57 6.81
## Between 5 and less than 7 LMMW-Between 1 and less than 2 LMMW 14.09 11.72
## Between 7 and less than 10 LMMW-Between 1 and less than 2 LMMW 19.41 16.29
## less than 1 LMMW-Between 1 and less than 2 LMMW -3.46 -5.77
## Between 3 and less than 5 LMMW-Between 2 and less than 3 LMMW 4.39 2.52
## Between 5 and less than 7 LMMW-Between 2 and less than 3 LMMW 9.92 7.46
## Between 7 and less than 10 LMMW-Between 2 and less than 3 LMMW 15.23 12.05
## less than 1 LMMW-Between 2 and less than 3 LMMW -7.63 -10.03
## Between 5 and less than 7 LMMW-Between 3 and less than 5 LMMW 5.52 2.99
## Between 7 and less than 10 LMMW-Between 3 and less than 5 LMMW 10.84 7.6
## less than 1 LMMW-Between 3 and less than 5 LMMW -12.02 -14.5
## Between 7 and less than 10 LMMW-Between 5 and less than 7 LMMW 5.32 1.7
## less than 1 LMMW-Between 5 and less than 7 LMMW -17.55 -20.49
## less than 1 LMMW-Between 7 and less than 10 LMMW -22.86 -26.44
## upr p adj
## 10 or more LMMW-0 18.62 <.001
## Between 1 and less than 2 LMMW-0 -8.85 <.001
## Between 2 and less than 3 LMMW-0 -4.63 <.001
## Between 3 and less than 5 LMMW-0 -0.19 .033
## Between 5 and less than 7 LMMW-0 5.63 .994
## Between 7 and less than 10 LMMW-0 11.38 .002
## less than 1 LMMW-0 -11.95 <.001
## Between 1 and less than 2 LMMW-10 or more LMMW -24.22 <.001
## Between 2 and less than 3 LMMW-10 or more LMMW -19.96 <.001
## Between 3 and less than 5 LMMW-10 or more LMMW -15.5 <.001
## Between 5 and less than 7 LMMW-10 or more LMMW -9.56 <.001
## Between 7 and less than 10 LMMW-10 or more LMMW -3.67 <.001
## less than 1 LMMW-10 or more LMMW -27.15 <.001
## Between 2 and less than 3 LMMW-Between 1 and less than 2 LMMW 5.81 <.001
## Between 3 and less than 5 LMMW-Between 1 and less than 2 LMMW 10.32 <.001
## Between 5 and less than 7 LMMW-Between 1 and less than 2 LMMW 16.46 <.001
## Between 7 and less than 10 LMMW-Between 1 and less than 2 LMMW 22.52 <.001
## less than 1 LMMW-Between 1 and less than 2 LMMW -1.15 <.001
## Between 3 and less than 5 LMMW-Between 2 and less than 3 LMMW 6.27 <.001
## Between 5 and less than 7 LMMW-Between 2 and less than 3 LMMW 12.38 <.001
## Between 7 and less than 10 LMMW-Between 2 and less than 3 LMMW 18.42 <.001
## less than 1 LMMW-Between 2 and less than 3 LMMW -5.22 <.001
## Between 5 and less than 7 LMMW-Between 3 and less than 5 LMMW 8.06 <.001
## Between 7 and less than 10 LMMW-Between 3 and less than 5 LMMW 14.08 <.001
## less than 1 LMMW-Between 3 and less than 5 LMMW -9.54 <.001
## Between 7 and less than 10 LMMW-Between 5 and less than 7 LMMW 8.93 <.001
## less than 1 LMMW-Between 5 and less than 7 LMMW -14.6 <.001
## less than 1 LMMW-Between 7 and less than 10 LMMW -19.29 <.001
# One-way ANOVA of global score by revenue band, used for the report below.
res2<-stats::aov(G_SC~ REVENUE, data = academic)
# Pull the F statistic and p-value out of the ANOVA summary table.
fstat<-summary(res2)[[1]][["F value"]][[1]]
aovpvalue<-summary(res2)[[1]][["Pr(>F)"]][[1]]
# NOTE(review): sjstats::eta_sq() is deprecated in recent sjstats releases;
# effectsize::eta_squared() is the suggested replacement -- confirm before
# upgrading packages.
aoveta<-sjstats::eta_sq(res2)[2]
A one-way between-groups analysis of variance (ANOVA) was conducted to explore the impact of revenue on global score. Participants were divided into eight groups according to their revenue (Group 1: 0 ; Group 2: 10 or more LMMW ; Group 3: Between 1 and less than 2 LMMW ; Group 4: Between 2 and less than 3 LMMW ; Group 5: Between 3 and less than 5 LMMW ; Group 6: Between 5 and less than 7 LMMW; Group 7: Between 7 and less than 10 LMMW ; Group 8: less than 1 LMMW). There was a statistically significant difference at the p < .05 level in global scores for the eight revenue groups: F(7, 12403) = 222.12, p < .05. Despite reaching statistical significance, the actual difference in mean scores between groups was quite small. The effect size, calculated using eta squared, was 0.11. Post-hoc comparisons using the Tukey HSD test indicated that the mean scores for Group 1, Group 2, Group 3 and Group 5 were significantly different from each other. Group 1 and Group 6 did not differ significantly from each other.
Build linear regression model1- Outcome variable: Global score ; Predictors: Mathematics and WC_PRO
H0: There is no relationship between global score and mathematics score, written communication score. H1: There is a relationship between global score and mathematics score, written communication score.
#Baseline model
# Model 1: regress global score on mathematics score (MAT_S11) and written
# communication score (WC_PRO) -- a two-predictor multiple regression.
model1=lm(academic$G_SC~academic$MAT_S11+academic$WC_PRO)
# Text table of coefficients, fit statistics and significance stars
stargazer::stargazer(model1, type="text")
##
## ================================================
## Dependent variable:
## ----------------------------
## G_SC
## ------------------------------------------------
## MAT_S11 1.076***
## (0.011)
##
## WC_PRO 0.339***
## (0.004)
##
## Constant 75.304***
## (0.728)
##
## ------------------------------------------------
## Observations 12,411
## R2 0.600
## Adjusted R2 0.600
## Residual Std. Error 14.615 (df = 12408)
## F Statistic 9,314.261*** (df = 2; 12408)
## ================================================
## Note: *p<0.1; **p<0.05; ***p<0.01
Reporting model 1 A multiple linear regression was carried out to investigate whether written communication and mathematics score predict global score. A significant regression equation was found (F(2, 12408) = 9314.26, p < .05), with an R2 of 0.60: Global score = 75.30 + 1.08 * (MAT_S11) + 0.34 * (WC_PRO). Mathematics score and written communication score are both significant predictors of global score.
The R2 value of 0.6 indicates that 60% of the variation in global score can be explained by the model containing mathematics score and written communication.
#Influential Outliers - Cook's distance
# Cook's distance flags observations whose removal would noticeably change
# the fitted coefficients of model1.
cooksd<-sort(cooks.distance(model1))
#Cook's distance plot: each point is one observation's Cook's distance
plot(cooksd, pch="*", cex=2, main="Influential Observations by Cooks distance")
# Conventional cut-off: 4 times the mean Cook's distance
abline(h = 4*mean(cooksd, na.rm=TRUE), col="red")
text(x=seq_along(cooksd)+1, y=cooksd, labels=ifelse(cooksd>4*mean(cooksd, na.rm=TRUE),names(cooksd),""), col="red") # add labels
#Cook's distance is used to find the influential outliers in a set of predictor variables. It helps to identify the points that negatively affect the regression model. The red line in the figure corresponds to the recommended threshold value (4 * mean). Several Cook's distance values are relatively higher than the others and exceed the threshold value.
# Count observations above the cut-off (na.rm=TRUE added so the count uses the
# same threshold as the plot; "Obsservations" typo in the title also fixed)
sum((cooks.distance(model1))>4*mean(cooksd, na.rm=TRUE))
## [1] 618
#618
#There are 618 influential observations in the dataset, i.e. about 5% of the 12,411 observations.
# Row indices (taken from the names of the sorted Cook's distance vector) of
# the observations above the 4*mean cut-off
influential <- as.numeric(names(cooksd)[(cooksd > 4*mean(cooksd, na.rm=T))])
# Stem-and-leaf display of where the influential rows fall in the dataset
stem(influential)
##
## The decimal point is 3 digit(s) to the right of the |
##
## 0 | 000111112222223333344455555566666678888888899
## 1 | 0000111111222333333333344444555555667888899999
## 2 | 000000001111233445566667777888899999
## 3 | 00111111222222233333344444444555567777778888899
## 4 | 001111111222333334444444455555556666777777788888999999999
## 5 | 00000011111223333333333444445566666666666667778888899999
## 6 | 00011111111222222223333444444444555555556666677777778888888888999999
## 7 | 0000000111122222234444444445567778889999999999
## 8 | 000111112223334444444555555667778889
## 9 | 000000011112222222333444444555556666666666667777778888888999
## 10 | 0000011111122334444455555666666677777777888888899
## 11 | 0111112222222222333333444444555556678888899999
## 12 | 0011111111222223333334444
head(academic[influential, ]) # influential observations.
## # A tibble: 6 x 46
## COD_S11 GENDER EDU_FATHER EDU_MOTHER OCC_FATHER OCC_MOTHER STRATUM SISBEN
## <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr>
## 1 SB1120… M Complete … Complete … Other occ… Home Stratu… Level…
## 2 SB1120… M 0 0 0 0 0 0
## 3 SB1120… F Complete … Complete … Technical… Technical… Stratu… Level…
## 4 SB1120… F Postgradu… Postgradu… Technical… Executive Stratu… It is…
## 5 SB1120… F Complete … Complete … Auxiliary… Small ent… Stratu… Level…
## 6 SB1120… M Postgradu… Complete … 0 Home Stratu… It is…
## # … with 38 more variables: PEOPLE_HOUSE <chr>, ...10 <lgl>, INTERNET <chr>,
## # TV <chr>, COMPUTER <chr>, WASHING_MCH <chr>, MIC_OVEN <chr>, CAR <chr>,
## # DVD <chr>, FRESH <chr>, PHONE <chr>, MOBILE <chr>, REVENUE <chr>,
## # JOB <chr>, SCHOOL_NAME <chr>, SCHOOL_NAT <chr>, SCHOOL_TYPE <chr>,
## # MAT_S11 <dbl>, CR_S11 <dbl>, CC_S11 <dbl>, BIO_S11 <dbl>, ENG_S11 <dbl>,
## # Cod_SPro <chr>, UNIVERSITY <chr>, ACADEMIC_PROGRAM <chr>, QR_PRO <dbl>,
## # CR_PRO <dbl>, CC_PRO <dbl>, ENG_PRO <dbl>, WC_PRO <dbl>, FEP_PRO <dbl>,
## # G_SC <dbl>, PERCENTILE <dbl>, `2ND_DECILE` <dbl>, QUARTILE <dbl>,
## # SEL <dbl>, SEL_IHE <dbl>, rv <dbl>
head(academic[influential, ]$G_SC)
## [1] 104 182 125 201 108 222
# 104 182 125 201 108 222 are influential observations of global score
head(academic[influential, ]$MAT_S11)
## [1] 47 49 49 100 52 86
# 47 49 49 100 52 86 are influential observations of maths
head(academic[influential, ]$WC_PRO)
## [1] 4 90 73 100 32 92
# 4 90 73 100 32 92 are influential observations of written communication
car::outlierTest(model1)
## rstudent unadjusted p-value Bonferroni p
## 1336 -10.671216 1.8078e-26 2.2437e-22
## 7721 -5.458853 4.8844e-08 6.0620e-04
## 10858 -5.374858 7.8030e-08 9.6843e-04
## 2021 -4.799068 1.6126e-06 2.0014e-02
## 173 -4.758351 1.9738e-06 2.4497e-02
#Observations 1336, 7721, 10858, 2021, 173 are outliers
car::leveragePlots(model1) # leverage plots
plot(model1,1)
#The red line is roughly horizontal around 0, which supports the linearity assumption of the model.
#Homoscedasticity
plot(model1, 3)
#This plot helps us to assess the homoscedasticity of the model. The points appear fairly randomly scattered across the range of the X axis and the red line is roughly flat, so we can conclude that the model is approximately homoscedastic.
#Create histogram and density plot of the residuals
plot(density(resid(model1)))
car::qqPlot(model1, main="QQ Plot") #qq plot for studentized resid
## [1] 1336 7721
#Collinearity
vifmodel<-car::vif(model1)
vifmodel
## academic$MAT_S11 academic$WC_PRO
## 1.044855 1.044855
#tolerance
1/vifmodel
## academic$MAT_S11 academic$WC_PRO
## 0.9570702 0.9570702
Build linear regression model2- Outcome variable: Global score ; Predictors: Mathematics, WC_PRO and Revenue
H0: There is no relationship between global score and mathematics score, written communication score, revenue. H1: There is a relationship between global score and mathematics score, written communication score, revenue.
# Model 2: add numeric revenue (rv) to the baseline predictors
model2=lm(academic$G_SC~academic$MAT_S11+academic$WC_PRO+academic$rv)
summary(model2)
##
## Call:
## lm(formula = academic$G_SC ~ academic$MAT_S11 + academic$WC_PRO +
## academic$rv)
##
## Residuals:
## Min 1Q Median 3Q Max
## -155.739 -9.657 0.605 9.888 66.259
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 75.97773 0.80221 94.711 <2e-16 ***
## academic$MAT_S11 1.07564 0.01129 95.250 <2e-16 ***
## academic$WC_PRO 0.33918 0.00447 75.880 <2e-16 ***
## academic$rv -0.15539 0.07763 -2.002 0.0453 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 14.61 on 12407 degrees of freedom
## Multiple R-squared: 0.6003, Adjusted R-squared: 0.6002
## F-statistic: 6212 on 3 and 12407 DF, p-value: < 2.2e-16
anova(model2)
## Analysis of Variance Table
##
## Response: academic$G_SC
## Df Sum Sq Mean Sq F value Pr(>F)
## academic$MAT_S11 1 2748008 2748008 12868.6032 < 2e-16 ***
## academic$WC_PRO 1 1230958 1230958 5764.4332 < 2e-16 ***
## academic$rv 1 856 856 4.0073 0.04533 *
## Residuals 12407 2649435 214
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Reporting Multiple linear regression was carried out to investigate the relationship between mathematics score, written communication score, revenue and global score. There was a significant relationship between mathematics score and global score (p < .001), written communication and global score (p < .001), and revenue and global score (p = .045). A significant regression equation was found (F(3, 12407) = 6212, p < .05): Global score = 75.98 + 1.08 * (MAT_S11) + 0.34 * (WC_PRO) - 0.16 * (rv), with an R2 of 0.60. Mathematics score, written communication score and revenue are significant predictors of global score.
Revenue was converted to a numeric variable (rv). The R2 value of 0.60 indicates that 60% of the variation in global score can be explained by the model containing mathematics score, written communication and revenue.
#Influential Outliers - Cook's distance
# Same Cook's distance diagnostic as for model1, now applied to model2.
cooksd<-sort(cooks.distance(model2))
#Cook's distance plot
plot(cooksd, pch="*", cex=2, main="Influential Observations by Cooks distance")
# Conventional cut-off: 4 times the mean Cook's distance
abline(h = 4*mean(cooksd, na.rm=TRUE), col="red")
text(x=seq_along(cooksd)+1, y=cooksd, labels=ifelse(cooksd>4*mean(cooksd, na.rm=TRUE),names(cooksd),""), col="red") # add labels
#Cook's distance is used to find the influential outliers in a set of predictor variables. It helps to identify the points that negatively affect the regression model. The red line in the figure corresponds to the recommended threshold value (4 * mean). Several Cook's distance values are relatively higher than the others and exceed the threshold value.
# Count observations above the cut-off (na.rm=TRUE added for consistency with
# the plotted threshold; "Obsservations" typo in the title also fixed)
sum((cooks.distance(model2))>4*mean(cooksd, na.rm=TRUE))
## [1] 619
#619
#There are 619 influential observations in the dataset, i.e. about 5% of the 12,411 observations.
influential <- as.numeric(names(cooksd)[(cooksd > 4*mean(cooksd, na.rm=T))])
stem(influential)
##
## The decimal point is 3 digit(s) to the right of the |
##
## 0 | 000011111122222333344445556666667788888999
## 1 | 00111111122222333333333334444455555556666778888999999
## 2 | 00000011112334445555566666667778889999
## 3 | 001111111122222333333344444444455555566777777888899
## 4 | 0001111111122222333333444444455555555666677777778888899999999
## 5 | 0000001111112223333333334445555666666666666777788889999
## 6 | 00001111112222222333444444555555555566667777777788888999999999
## 7 | 00011111222223334444444445567778888999999999
## 8 | 0001111112333344444455556677778889
## 9 | 0000000111111122222233334444455666666666677777788888888899
## 10 | 0000001111112333344445566666677777777888888888999
## 11 | 001111111222222222333334444445555555667888999999
## 12 | 001111111111222333334444
head(academic[influential, ]) # influential observations.
## # A tibble: 6 x 46
## COD_S11 GENDER EDU_FATHER EDU_MOTHER OCC_FATHER OCC_MOTHER STRATUM SISBEN
## <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr>
## 1 SB1120… F Postgradu… Complete … Technical… Home Stratu… Level…
## 2 SB1120… M Incomplet… Incomplet… Other occ… Home Stratu… Level…
## 3 SB1120… M Complete … Postgradu… Independe… Technical… Stratu… Level…
## 4 SB1120… M Complete … Complete … Independe… Operator Stratu… Level…
## 5 SB1120… F Complete … Complete … Technical… Auxiliary… Stratu… It is…
## 6 SB1120… M Postgradu… Postgradu… Independe… Independe… Stratu… It is…
## # … with 38 more variables: PEOPLE_HOUSE <chr>, ...10 <lgl>, INTERNET <chr>,
## # TV <chr>, COMPUTER <chr>, WASHING_MCH <chr>, MIC_OVEN <chr>, CAR <chr>,
## # DVD <chr>, FRESH <chr>, PHONE <chr>, MOBILE <chr>, REVENUE <chr>,
## # JOB <chr>, SCHOOL_NAME <chr>, SCHOOL_NAT <chr>, SCHOOL_TYPE <chr>,
## # MAT_S11 <dbl>, CR_S11 <dbl>, CC_S11 <dbl>, BIO_S11 <dbl>, ENG_S11 <dbl>,
## # Cod_SPro <chr>, UNIVERSITY <chr>, ACADEMIC_PROGRAM <chr>, QR_PRO <dbl>,
## # CR_PRO <dbl>, CC_PRO <dbl>, ENG_PRO <dbl>, WC_PRO <dbl>, FEP_PRO <dbl>,
## # G_SC <dbl>, PERCENTILE <dbl>, `2ND_DECILE` <dbl>, QUARTILE <dbl>,
## # SEL <dbl>, SEL_IHE <dbl>, rv <dbl>
head(academic[influential, ]$G_SC)
## [1] 205 154 114 140 191 227
# 205 154 114 140 191 227 are influential observations of global score
head(academic[influential, ]$MAT_S11)
## [1] 60 81 45 56 96 91
# 60 81 45 56 96 91 are influential observations of maths
head(academic[influential, ]$WC_PRO)
## [1] 58 38 34 80 96 98
# 58 38 34 80 96 98 are influential observations of written communication
car::outlierTest(model2)
## rstudent unadjusted p-value Bonferroni p
## 1336 -10.709375 1.2025e-26 1.4925e-22
## 7721 -5.463285 4.7642e-08 5.9129e-04
## 10858 -5.336734 9.6302e-08 1.1952e-03
## 2021 -4.813386 1.5014e-06 1.8634e-02
## 173 -4.772609 1.8393e-06 2.2828e-02
#Observations 1336, 7721, 10858, 2021 and 173 are flagged as outliers by the Bonferroni outlier test.
car::leveragePlots(model2) # leverage plots
plot(model2,1)
#The red line is roughly horizontal around 0, which supports the linearity assumption of the model.
#Homoscedasticity
plot(model2, 3)
#This plot helps us to assess the homoscedasticity of the model. The points appear fairly randomly scattered across the range of the X axis and the red line is roughly flat, so we can conclude that the model is approximately homoscedastic.
#Also, the plot has a roughly horizontal line with an even spread of observations.
#Create histogram and density plot of the residuals
# BUG FIX: this section diagnoses model2, but the original code plotted the
# residual density and QQ plot of model1 (copy-paste error from the
# model1 section above).
plot(density(resid(model2)))
car::qqPlot(model2, main="QQ Plot") #qq plot for studentized resid
## [1] 1336 7721
#Collinearity
vifmodel<-car::vif(model2)
vifmodel
## academic$MAT_S11 academic$WC_PRO academic$rv
## 1.044858 1.045145 1.000282
#tolerance
1/vifmodel
## academic$MAT_S11 academic$WC_PRO academic$rv
## 0.9570683 0.9568046 0.9997182
Differential effect : Adding a term TV to understand differential effect.
Build linear regression model3 - Outcome variable: Global score ; Predictors: Mathematics, WC_PRO, Revenue and TV
H0: There is no relationship between global score and mathematics score, written communication score, revenue, TV. H1: There is a relationship between global score and mathematics score, written communication score, revenue, TV.
#Model 3 adding in TV
# Encode TV ownership as a numeric dummy: 1 = "Yes", 0 = "No", NA for any
# other value
academic$tvf=ifelse(academic$TV == "Yes", 1, ifelse(academic$TV == "No", 0, NA))
# Model 3: baseline predictors plus revenue and the TV dummy
model3=lm(academic$G_SC~academic$MAT_S11+academic$WC_PRO+academic$rv+academic$tvf)
summary(model3)
##
## Call:
## lm(formula = academic$G_SC ~ academic$MAT_S11 + academic$WC_PRO +
## academic$rv + academic$tvf)
##
## Residuals:
## Min 1Q Median 3Q Max
## -155.804 -9.678 0.670 9.831 65.841
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 73.541502 0.845204 87.010 <2e-16 ***
## academic$MAT_S11 1.065957 0.011310 94.251 <2e-16 ***
## academic$WC_PRO 0.338026 0.004458 75.829 <2e-16 ***
## academic$rv -0.085373 0.077780 -1.098 0.272
## academic$tvf 3.311155 0.371888 8.904 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 14.57 on 12406 degrees of freedom
## Multiple R-squared: 0.6029, Adjusted R-squared: 0.6028
## F-statistic: 4708 on 4 and 12406 DF, p-value: < 2.2e-16
anova(model3)
## Analysis of Variance Table
##
## Response: academic$G_SC
## Df Sum Sq Mean Sq F value Pr(>F)
## academic$MAT_S11 1 2748008 2748008 12949.7903 < 2e-16 ***
## academic$WC_PRO 1 1230958 1230958 5800.8006 < 2e-16 ***
## academic$rv 1 856 856 4.0325 0.04465 *
## academic$tvf 1 16823 16823 79.2749 < 2e-16 ***
## Residuals 12406 2632613 212
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Reporting Multiple linear regression was carried out to investigate the relationship between mathematics score, written communication score, revenue, TV and global score. There was a significant relationship between mathematics score and global score (p < .001), written communication and global score (p < .001), and TV and global score (p < .001); revenue was not a significant predictor in this model (p = .272). A significant regression equation was found (F(4, 12406) = 4708, p < .05), with an R2 of 0.60. Mathematics score, written communication score and TV are significant predictors of global score.
#Influential Outliers - Cook's distance
# Cook's distance diagnostic for model3.
cooksd<-sort(cooks.distance(model3))
#Cook's distance plot
plot(cooksd, pch="*", cex=2, main="Influential Observations by Cooks distance")
# Conventional cut-off: 4 times the mean Cook's distance
abline(h = 4*mean(cooksd, na.rm=TRUE), col="red")
text(x=seq_along(cooksd)+1, y=cooksd, labels=ifelse(cooksd>4*mean(cooksd, na.rm=TRUE),names(cooksd),""), col="red")
# BUG FIX: the original counted outliers with cooks.distance(model2) while
# thresholding on model3's distances; both must use model3.
sum((cooks.distance(model3))>4*mean(cooksd, na.rm=TRUE))
## [1] 629
#629
#There are 629 influential observations in the dataset, i.e. about 5% of the 12,411 observations.
influential <- as.numeric(names(cooksd)[(cooksd > 4*mean(cooksd, na.rm=T))])
stem(influential)
##
## The decimal point is 3 digit(s) to the right of the |
##
## 0 | 0000011111222233333444455555566666777788888889999
## 1 | 0001111112222223333333333344444444555555556666677777788888899999999
## 2 | 00000001111122233334445555566666677777788899999
## 3 | 00011111111112222222333333334444444444555555666677777788888889999
## 4 | 0000001111111122222233333444444455556666777777888888999999
## 5 | 0000011112233333445556666666667777788888999
## 6 | 00001111112222223334444444555555555666677777777778888889999999999
## 7 | 0001111222233444555677788999999999
## 8 | 0000111111223344444455555556777889
## 9 | 0000000111122222233334444455566666666667777778888888889
## 10 | 000000111223344455556666677777788888888889999
## 11 | 000111111222222222333344444445555567788899999999
## 12 | 000111111111223333444
head(academic[influential, ]) # influential observations.
## # A tibble: 6 x 47
## COD_S11 GENDER EDU_FATHER EDU_MOTHER OCC_FATHER OCC_MOTHER STRATUM SISBEN
## <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr>
## 1 SB1120… M Complete … Postgradu… Executive Other occ… Stratu… It is…
## 2 SB1120… M Incomplet… Incomplet… Small ent… Other occ… Stratu… It is…
## 3 SB1120… M Postgradu… Complete … Technical… Technical… Stratu… Level…
## 4 SB1120… M Complete … Complete … Technical… Home Stratu… It is…
## 5 SB1120… M Incomplet… Complete … Operator Independe… Stratu… It is…
## 6 SB1120… F Incomplet… Incomplet… 0 0 Stratu… Esta …
## # … with 39 more variables: PEOPLE_HOUSE <chr>, ...10 <lgl>, INTERNET <chr>,
## # TV <chr>, COMPUTER <chr>, WASHING_MCH <chr>, MIC_OVEN <chr>, CAR <chr>,
## # DVD <chr>, FRESH <chr>, PHONE <chr>, MOBILE <chr>, REVENUE <chr>,
## # JOB <chr>, SCHOOL_NAME <chr>, SCHOOL_NAT <chr>, SCHOOL_TYPE <chr>,
## # MAT_S11 <dbl>, CR_S11 <dbl>, CC_S11 <dbl>, BIO_S11 <dbl>, ENG_S11 <dbl>,
## # Cod_SPro <chr>, UNIVERSITY <chr>, ACADEMIC_PROGRAM <chr>, QR_PRO <dbl>,
## # CR_PRO <dbl>, CC_PRO <dbl>, ENG_PRO <dbl>, WC_PRO <dbl>, FEP_PRO <dbl>,
## # G_SC <dbl>, PERCENTILE <dbl>, `2ND_DECILE` <dbl>, QUARTILE <dbl>,
## # SEL <dbl>, SEL_IHE <dbl>, rv <dbl>, tvf <dbl>
head(academic[influential, ]$G_SC)
## [1] 123 138 214 175 135 188
#123 138 214 175 135 188 are influential observations of global score
head(academic[influential, ]$MAT_S11)
## [1] 65 67 73 64 53 52
#65 67 73 64 53 52 are influential observations of maths
head(academic[influential, ]$WC_PRO)
## [1] 0 87 72 10 76 99
#0 87 72 10 76 99 are influential observations of written communication
car::outlierTest(model3)
## rstudent unadjusted p-value Bonferroni p
## 1336 -10.747996 7.9486e-27 9.8650e-23
## 7721 -5.512795 3.6024e-08 4.4709e-04
## 10858 -5.407799 6.4987e-08 8.0655e-04
## 2021 -4.868291 1.1395e-06 1.4143e-02
## 173 -4.814642 1.4920e-06 1.8518e-02
#1336, 7721, 10858, 2021 and 173 are the observations whose outcome values are unusual given their predictor values.
car::leveragePlots(model3) # leverage plots
plot(model3,1)
#The red line is roughly horizontal around 0, which supports the linearity assumption of the model.
#Homoscedasticity
plot(model3, 3)
#This plot helps us to assess the homoscedasticity of the model. The points appear fairly randomly scattered across the range of the X axis and the red line is roughly flat, so we can conclude that the model is approximately homoscedastic.
#Also, the plot has horizontal line with equally spread of observations.
#Create histogram and density plot of the residuals
plot(density(resid(model3)))
car::qqPlot(model3, main="QQ Plot") #qq plot for studentized resid
## [1] 1336 7721
#Collinearity
vifmodel<-car::vif(model3)
vifmodel
## academic$MAT_S11 academic$WC_PRO academic$rv academic$tvf
## 1.054605 1.046026 1.010613 1.022315
#tolerance
1/vifmodel
## academic$MAT_S11 academic$WC_PRO academic$rv academic$tvf
## 0.9482223 0.9559996 0.9894986 0.9781722
Interaction effect: Adding a term TV * MAT_S11 to understand interaction effect.
Build linear regression model4 - Outcome variable: Global score ; Predictors: Mathematics, WC_PRO, Revenue, TV, (TV*MAT_S11)
H0: There is no relationship between global score and mathematics score, written communication score, revenue, TV, (TV*MAT_S11). H1: There is a relationship between global score and mathematics score, written communication score, revenue, TV, (TV*MAT_S11).
# Interaction term: TV dummy multiplied by mathematics score
academic$variable <- (academic$tvf) * academic$MAT_S11
# Model 4: model 3 predictors plus the TV x maths interaction term
model4=lm(academic$G_SC~academic$MAT_S11+academic$WC_PRO+academic$rv+academic$tvf+academic$variable)
summary(model4)
##
## Call:
## lm(formula = academic$G_SC ~ academic$MAT_S11 + academic$WC_PRO +
## academic$rv + academic$tvf + academic$variable)
##
## Residuals:
## Min 1Q Median 3Q Max
## -155.783 -9.688 0.669 9.830 65.832
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 73.021149 1.926271 37.908 <2e-16 ***
## academic$MAT_S11 1.074404 0.030290 35.471 <2e-16 ***
## academic$WC_PRO 0.338025 0.004458 75.826 <2e-16 ***
## academic$rv -0.084939 0.077797 -1.092 0.2749
## academic$tvf 3.914812 2.042181 1.917 0.0553 .
## academic$variable -0.009759 0.032464 -0.301 0.7637
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 14.57 on 12405 degrees of freedom
## Multiple R-squared: 0.6029, Adjusted R-squared: 0.6027
## F-statistic: 3767 on 5 and 12405 DF, p-value: < 2.2e-16
anova(model4)
## Analysis of Variance Table
##
## Response: academic$G_SC
## Df Sum Sq Mean Sq F value Pr(>F)
## academic$MAT_S11 1 2748008 2748008 12948.8408 < 2e-16 ***
## academic$WC_PRO 1 1230958 1230958 5800.3753 < 2e-16 ***
## academic$rv 1 856 856 4.0322 0.04466 *
## academic$tvf 1 16823 16823 79.2691 < 2e-16 ***
## academic$variable 1 19 19 0.0904 0.76371
## Residuals 12405 2632594 212
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#Cook's distance
# Influence diagnostics for model4 (same procedure as for the earlier models)
cooksd<-sort(cooks.distance(model4))
plot(cooksd, pch="*", cex=2, main="Influential observation by Cooks distance")
abline(h = 4*mean(cooksd, na.rm=T), col="red") # add cutoff line
text(x=1:length(cooksd)+1, y=cooksd, labels=ifelse(cooksd>4*mean(cooksd, na.rm=T),names(cooksd),""), col="red") # add labels
# Count of observations above the 4*mean threshold
sum((cooks.distance(model4))>4*mean(cooksd))
## [1] 610
#610
#610 observations in the population are outliers.
influential <- as.numeric(names(cooksd)[(cooksd > 4*mean(cooksd, na.rm=T))]) # influential row numbers
stem(influential)
##
## The decimal point is 3 digit(s) to the right of the |
##
## 0 | 00000111112222233334444455556666677777888888889999
## 1 | 00011111122222333333333444444444555555566666777777788888889999999
## 2 | 00000001111111223333444555566666777778889999
## 3 | 00111111112222222233333333333444444444555555566667777788888888899999
## 4 | 00000000111111111122222222233334444455667778888889999
## 5 | 000111123333344555556666666777778888999
## 6 | 000011111222233334444445555555566777777777788888899999999
## 7 | 0000111122223344455667777888999999999
## 8 | 0000011111122234444555555666777788899
## 9 | 00000001112222222233334444445556666666677777777888888889
## 10 | 00000011223445556666677777888888889999
## 11 | 001111112222222233334444555556677788899999999
## 12 | 000111111112223334444
head(academic[influential, ]) # influential observations.
## # A tibble: 6 x 48
## COD_S11 GENDER EDU_FATHER EDU_MOTHER OCC_FATHER OCC_MOTHER STRATUM SISBEN
## <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr>
## 1 SB1120… F Incomplet… Complete … Independe… Home Stratu… Level…
## 2 SB1120… M Postgradu… Incomplet… 0 Independe… Stratu… It is…
## 3 SB1120… M Complete … Complete … Independe… Auxiliary… Stratu… It is…
## 4 SB1120… M Complete … Incomplet… Retired Home Stratu… It is…
## 5 SB1120… F Complete … Complete … Independe… Small ent… Stratu… Level…
## 6 SB1120… M Incomplet… Complete … Entrepren… Independe… Stratu… It is…
## # … with 40 more variables: PEOPLE_HOUSE <chr>, ...10 <lgl>, INTERNET <chr>,
## # TV <chr>, COMPUTER <chr>, WASHING_MCH <chr>, MIC_OVEN <chr>, CAR <chr>,
## # DVD <chr>, FRESH <chr>, PHONE <chr>, MOBILE <chr>, REVENUE <chr>,
## # JOB <chr>, SCHOOL_NAME <chr>, SCHOOL_NAT <chr>, SCHOOL_TYPE <chr>,
## # MAT_S11 <dbl>, CR_S11 <dbl>, CC_S11 <dbl>, BIO_S11 <dbl>, ENG_S11 <dbl>,
## # Cod_SPro <chr>, UNIVERSITY <chr>, ACADEMIC_PROGRAM <chr>, QR_PRO <dbl>,
## # CR_PRO <dbl>, CC_PRO <dbl>, ENG_PRO <dbl>, WC_PRO <dbl>, FEP_PRO <dbl>,
## # G_SC <dbl>, PERCENTILE <dbl>, `2ND_DECILE` <dbl>, QUARTILE <dbl>,
## # SEL <dbl>, SEL_IHE <dbl>, rv <dbl>, tvf <dbl>, variable <dbl>
car::outlierTest(model4)
## rstudent unadjusted p-value Bonferroni p
## 1336 -10.746246 8.0996e-27 1.0052e-22
## 7721 -5.512041 3.6178e-08 4.4901e-04
## 10858 -5.408441 6.4755e-08 8.0368e-04
## 2021 -4.870049 1.1295e-06 1.4018e-02
## 173 -4.814685 1.4917e-06 1.8514e-02
#1336, 7721, 10858, 2021, 173 are the outcome variables that have unusual variables for its predictor values.
car::leveragePlots(model4)
plot(model4,1)
#The red line is roughly horizontal around 0, which supports the linearity assumption of the model.
#Homoscedasticity
plot(model4, 3)
#This plot helps us to assess the homoscedasticity of the model. The points appear fairly randomly scattered across the range of the X axis and the red line is roughly flat, so we can conclude that the model is approximately homoscedastic.
#Also, the plot has horizontal line with equally spread of observations.
#Create histogram and a density plot of the residuals
plot(density(resid(model4)))
car::qqPlot(model4, main="QQ Plot Model 4") #qq plot for studentized resid
## [1] 1336 7721
#Collinearity
vifmodel<-car::vif(model4)
vifmodel
## academic$MAT_S11 academic$WC_PRO academic$rv academic$tvf
## 7.563800 1.046026 1.010961 30.826055
## academic$variable
## 40.186636
#Tolerance
1/vifmodel
## academic$MAT_S11 academic$WC_PRO academic$rv academic$tvf
## 0.13220868 0.95599914 0.98915809 0.03244009
## academic$variable
## 0.02488389
Reporting Multiple linear regression was carried out to investigate the relationship between mathematics score, written communication score, revenue, TV, the interaction term (TV*Maths score) and global score. There was a significant relationship between mathematics score and global score (p < .001) and between written communication and global score (p < .001). In this model, TV (p = .055), revenue (p = .275) and the interaction term (p = .76) were not statistically significant predictors. A significant regression equation was found (F(5, 12405) = 3767, p < .05), with an R2 of 0.60.
# Refit without MAT_S11 and tvf (the components of the interaction term) to
# show that the extreme VIFs in model4 were driven by that collinearity.
# FIX: removed a stray unary "+" after "~" in the original formula (it was a
# no-op but misleading).
m1 <- lm(academic$G_SC~academic$WC_PRO+academic$rv+academic$variable)
vifmodel<-car::vif(m1)
vifmodel
## academic$WC_PRO academic$rv academic$variable
## 1.016527 1.007323 1.023649
#tolerance
1/vifmodel
## academic$WC_PRO academic$rv academic$variable
## 0.9837413 0.9927298 0.9768973
A huge difference in the variance inflation factors was found when the variables that contributed to the interaction term (MAT_S11 and tvf) were eliminated, confirming that the extreme VIFs in model 4 were caused by the collinearity between the interaction term and its component variables.